library(DESeq2)
library(gplots)
library(RColorBrewer)
library(limma)
library(tidyverse)
package ‘dplyr’ was built under R version 3.5.1
# Read the sample information into R
sampleinfo <- read_tsv("../data/SampleInfo.Corrected.txt")
Parsed with column specification:
cols(
FileName = col_character(),
Sample = col_character(),
CellType = col_character(),
Status = col_character(),
Group = col_character()
)
# Read the data into R
seqdata <- read_tsv("../data/GSE60450_Lactation.featureCounts", comment = "#")
Parsed with column specification:
cols(
Geneid = col_character(),
Chr = col_character(),
Start = col_character(),
End = col_character(),
Strand = col_character(),
Length = col_integer(),
MCL1.DG.bam = col_integer(),
MCL1.DH.bam = col_integer(),
MCL1.DI.bam = col_integer(),
MCL1.DJ.bam = col_integer(),
MCL1.DK.bam = col_integer(),
MCL1.DL.bam = col_integer(),
MCL1.LA.bam = col_integer(),
MCL1.LB.bam = col_integer(),
MCL1.LC.bam = col_integer(),
MCL1.LD.bam = col_integer(),
MCL1.LE.bam = col_integer(),
MCL1.LF.bam = col_integer()
)
|= | 2%
|== | 3%
|=== | 4%
|=== | 6% 1 MB
|==== | 7% 1 MB
|==== | 8% 1 MB
|====== | 10% 1 MB
|====== | 11% 2 MB
|======= | 12% 2 MB
|======== | 14% 2 MB
|========= | 15% 2 MB
|========== | 16% 2 MB
|========== | 17% 3 MB
|=========== | 18% 3 MB
|=========== | 19% 3 MB
|============ | 20% 3 MB
|============ | 21% 3 MB
|============= | 22% 3 MB
|============== | 23% 4 MB
|============== | 24% 4 MB
|=============== | 25% 4 MB
|================ | 26% 4 MB
|================ | 27% 4 MB
|================= | 28% 5 MB
|================== | 30% 5 MB
|=================== | 31% 5 MB
|==================== | 33% 5 MB
|==================== | 34% 5 MB
|===================== | 35% 6 MB
|====================== | 37% 6 MB
|======================= | 38% 6 MB
|======================== | 40% 6 MB
|========================= | 41% 7 MB
|========================= | 42% 7 MB
|========================== | 43% 7 MB
|=========================== | 45% 7 MB
|=========================== | 46% 7 MB
|============================ | 47% 8 MB
|============================ | 48% 8 MB
|============================= | 49% 8 MB
|============================== | 50% 8 MB
|=============================== | 51% 8 MB
|=============================== | 53% 9 MB
|================================ | 54% 9 MB
|================================= | 56% 9 MB
|================================== | 57% 9 MB
|=================================== | 58% 10 MB
|==================================== | 60% 10 MB
|==================================== | 61% 10 MB
|===================================== | 62% 10 MB
|====================================== | 63% 10 MB
|====================================== | 64% 11 MB
|======================================= | 65% 11 MB
|======================================== | 66% 11 MB
|======================================== | 68% 11 MB
|========================================== | 70% 12 MB
|========================================== | 71% 12 MB
|=========================================== | 72% 12 MB
|============================================ | 74% 12 MB
|============================================= | 75% 13 MB
|============================================== | 77% 13 MB
|=============================================== | 78% 13 MB
|=============================================== | 79% 13 MB
|================================================ | 80% 13 MB
|================================================= | 82% 14 MB
|================================================= | 83% 14 MB
|================================================== | 83% 14 MB
|=================================================== | 85% 14 MB
|==================================================== | 86% 15 MB
|===================================================== | 88% 15 MB
|===================================================== | 89% 15 MB
|====================================================== | 91% 15 MB
|======================================================= | 92% 15 MB
|======================================================== | 93% 16 MB
|======================================================== | 94% 16 MB
|======================================================== | 94% 16 MB
|========================================================= | 95% 16 MB
|========================================================= | 96% 16 MB
|========================================================== | 97% 16 MB
|===========================================================| 98% 17 MB
|============================================================| 100% 17 MB
# Remove first two columns from seqdata
countdata <- as.data.frame(seqdata) %>%
column_to_rownames("Geneid") %>% # turn the geneid column into rownames
rename_all(str_remove, ".bam") %>% # remove the ".bam" from the column names
select(sampleinfo$Sample) %>% # keep sample columns using sampleinfo
as.matrix()
# filter data
keep <- rowSums(countdata) > 5
countdata <- countdata[keep,]
# rlogcounts
rlogcounts <- rlog(countdata)
# We estimate the variance for each row in the logcounts matrix
countVar <- apply(rlogcounts, 1, var)
# DGE list
design <- as.formula(~ CellType)
# create the DESeqDataSet object
ddsObj <- DESeqDataSetFromMatrix(countData = countdata,
colData = sampleinfo,
design = design)
some variables in design formula are characters, converting to factors
ddsObj <- estimateSizeFactors(ddsObj)
# normalise dcounts
logcounts <- log2(countdata + 1)
normalizedCounts <- counts(ddsObj, normalized=TRUE)
logNormalizedCounts <- log2(normalizedCounts + 1)
Challenge 1
- Use the
DESeq2functionrlogto transform the count data. This function also normalises for library size.- Plot the count distribution boxplots with this data How has this effected the count distributions?
rlogcounts <- rlog(countdata)
rownames(rlogcounts) <- rownames(countdata)
# reform the table and add sample data
plotData <- rlogcounts %>%
as.data.frame() %>%
gather("Sample", "logCounts") %>%
left_join(sampleinfo, "Sample")
# Check distributions of samples using boxplots
# Add a horizontal line at the overall median
medianCount <- median(plotData$logCounts)
ggplot(plotData, aes(x=Sample, y=logCounts, fill=Group)) +
geom_boxplot() +
geom_hline(yintercept = medianCount, colour = "blue")
Challenge 2
Redo the heatmap using the top 500 LEAST variable genes. Change the colour scheme to “PiYG” and redo the heatmap. Try
?RColorBrewerand see what other colour schemes are available. Change the sample names togroupusing thelabColargument Remove the gene names from the righthand side of the plot usinglabRow
Solution
# Get the gene names for the top 500 least variable genes
lowVar <- order(countVar)[1:500]
# Subset logcounts matrix
hmData <- rlogcounts[lowVar,]
## Get some nicer colours
mypalette <- brewer.pal(11,"PiYG")
## http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
morecols <- colorRampPalette(mypalette)
# Set up colour vector for celltype variable
col.cell <- c("purple","orange")[sampleinfo$CellType]
# Plot the heatmap
heatmap.2(hmData,
col=rev(morecols(50)),
trace="none",
main="Top 500 most variable genes across samples",
ColSideColors=col.cell,scale="row",
labCol=sampleinfo$Group,
labRow = NA)
Challenge 3
Plot the biased and unbiased MA plots both samples side by side to see the before and after normalisation.